Code
# We load the MAE object
mae <- load_latest_mae(str_c(data_dir(), "03_augmented_mae/"))
clin <- MultiAssayExperiment::colData(mae) |> as.data.frame() |> as_tibble()# We load the MAE object
mae <- load_latest_mae(str_c(data_dir(), "03_augmented_mae/"))
clin <- MultiAssayExperiment::colData(mae) |> as.data.frame() |> as_tibble()target_visits <- c(4, 7)
arms <- c("Placebo", "LACTIN-V")
conf.level <- 0.95In this document, we investigate whether the effects of Lactin-V vary by baseline microbiota composition or participant’s characteristics. The aim is to identify groups of participants that may benefit more than others, or to identify groups of participants that may be harmed by the product and for which caution should be taken in future trials.
We consider the following binary outcomes (\(Y\)):
Lactobacillus crispatus colonization (≥ 50% L. crispatus) at week 12 or at week 24 (the primary and secondary outcomes of this study), or
rBV by week 12 (any rBV before or at week 12) or by week 24 (the primary and secondary outcomes of the original Phase IIb study).
Y_Lc <-
get_assay_wide_format(mae, "mb_categories_wide") |>
filter(AVISITN %in% target_visits, ARM %in% arms) |>
select(USUBJID, AVISITN, assay) |>
mutate(
week = ifelse(AVISITN == 4, 12, 24),
Lc = assay$`≥ 50% L. crispatus`
) |>
pivot_wider(
id_cols = USUBJID, names_from = week, values_from = Lc, names_prefix = "Lc_week_"
) |>
arrange(USUBJID)
Y_rBV <-
clin |>
filter(ARM %in% arms, AVISITN > 1) |>
select(USUBJID, AVISITN, BV) |>
arrange(USUBJID, AVISITN) |>
distinct()
Y_rBV <-
Y_rBV |>
full_join(
expand_grid(
USUBJID = Y_rBV$USUBJID |> unique(),
AVISITN = c(4,7)
),
by = join_by(USUBJID, AVISITN)
) |>
mutate(BV = BV |> str_replace_na("Missing")) |>
group_by(USUBJID) |>
mutate(
n_rBV = cumsum(BV == "Yes"),
n_no_rBV = cumsum(BV == "No")
) |>
ungroup() |>
filter(AVISITN %in% target_visits) |>
mutate(
rBV =
case_when(
n_rBV > 0 ~ 1, # "rBV by" if any BV by this week
(n_rBV == 0) & (n_no_rBV > 0) ~ 0, # "no rBV by" if no BV and at least one observation until then.
TRUE ~ NA_integer_ # if no observation until then -> NA
),
week = ifelse(AVISITN == 4, 12, 24)
) |>
pivot_wider(
id_cols = USUBJID, names_from = week, values_from = rBV, names_prefix = "rBV_week_"
)Y <- full_join(Y_Lc, Y_rBV, by = join_by(USUBJID))
rm(Y_Lc, Y_rBV)Y |>
pivot_longer(cols = -USUBJID, names_to = "outcome", values_to = "value") |>
dplyr::count(outcome, value) |>
pivot_wider(names_from = outcome, values_from = n, values_fill = 0) |>
dplyr::rename(outcome = value) |>
mutate(outcome = outcome |> as.character() |> str_replace_na("Missing")) |>
gt(caption = "Number of participants per outcome")| outcome | Lc_week_12 | Lc_week_24 | rBV_week_12 | rBV_week_24 |
|---|---|---|---|---|
| 0 | 137 | 122 | 122 | 103 |
| 1 | 42 | 43 | 73 | 92 |
| Missing | 17 | 31 | 1 | 1 |
outcomes <- c("Lc_week_12", "Lc_week_24", "rBV_week_12", "rBV_week_24")The treatment (\(A\)) is LACTIN-V vs. placebo.
A <-
clin |> filter(ARM %in% arms) |> select(USUBJID, ARM) |> distinct() |>
mutate(ARM = ARM |> fct_relevel(arms))
AY <- full_join(A, Y, by = join_by(USUBJID))
rm(A, Y)The numbers of participant per considered outcomes in each arm are:
AY |>
pivot_longer(cols = -c(USUBJID, ARM), names_to = "outcome", values_to = "value") |>
mutate(outcome = outcome |> fct_inorder()) |>
filter(!is.na(value)) |>
dplyr::count(outcome, ARM) |>
pivot_wider(names_from = ARM, values_from = n, names_prefix = "n ", values_fill = 0) |>
gt(caption = "Number of participants in each arm per considered outcomes.")| outcome | n Placebo | n LACTIN-V |
|---|---|---|
| Lc_week_12 | 56 | 123 |
| Lc_week_24 | 52 | 113 |
| rBV_week_12 | 61 | 134 |
| rBV_week_24 | 61 | 134 |
The main objective of this analysis is to investigate if baseline microbiota is associated with differential benefit.
Here, we chose to represent baseline microbiota by the pre-MTZ topic composition (topic relative abundances) rather than all taxa or ASVs relative abundances to reduce the dimensionality of the data.
Since most Lactobacillus species, with the exception of L. iner, have a very low prevalence at baseline, we collapsed Lactobacillus-dominated topics into a single topic (which includes all Lactobacillus spp.)
In addition, since we consider relative abundances of topics, the proportion of any topic can be computed from the others, so we arbitrarily exclude one topic from the models so that no explanatory variable is a linear combination of the others. We chose to exclude the last one as it is the least prevalent.
V_topics <-
get_assay_wide_format(mae, "c_topics_16S_8") |>
filter(AVISITN == 0, ARM %in% arms) |>
dplyr::rename(topic = assay) |>
select(USUBJID, topic) |>
mutate(
topic = topic |> set_colnames(SummarizedExperiment::rowData(mae[["c_topics_16S_8"]])$topic_label)
)
if (any(duplicated(V_topics$USUBJID))) {
stop("Some participants have more than one topic composition at baseline")
}
V_microbiota <-
V_topics |>
select(USUBJID) |>
mutate(
microbiota =
bind_cols(
tibble(Lactobacillus = V_topics$topic |> select(1:4) |> rowSums()),
V_topics$topic |> select(-c(1:4, 8))
)
)In addition to our primary covariates (pre-MTZ subcommunity proportions), we will also estimate effect heterogeneity by the following categorical covariates:
baseline (pre-MTZ) sub-CSTs (correlated with the subcommunities);
pre-MTZ \(\alpha\)-diversity (Shannon index);
post-MTZ BV diagnosis;
self-declared race;
study site; and
contraceptive choices.
V_most_prev_genus <-
get_assay_long_format(mae, "tax_16S_p") |>
filter(AVISITN == 0, ARM %in% arms) |>
select(USUBJID, feature, value) |>
left_join(
SummarizedExperiment::rowData(mae[["tax_16S_p"]]) |>
as.data.frame() |>
rownames_to_column("feature") |>
as_tibble(),
by = join_by(feature)
) |>
group_by(USUBJID, Genus) |>
summarize(rel_ab = sum(value), .groups = "drop") |>
arrange(USUBJID, -rel_ab) |>
group_by(USUBJID) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::rename(most_prev_genus = Genus) |>
arrange(-rel_ab) |>
mutate(most_prev_genus = most_prev_genus |> str_replace("Candidatus Lachnocurva", "Ca. Lachn. (BVAB1)") |> fct_inorder()) |>
arrange(USUBJID) |>
select(-rel_ab)
V_shannon_taxa <-
get_assay_wide_format(mae, "tax_16S_p") |>
filter(AVISITN == 0, ARM %in% arms) |>
mutate(shannon_taxa = vegan::diversity(assay, index = "shannon")) |>
select(USUBJID, shannon_taxa)
V_microbiota <-
V_microbiota |>
mutate(
microbiota_with_div =
microbiota |> mutate(`α-diversity` = V_shannon_taxa$shannon_taxa)
)
V_subCST <-
get_assay_wide_format(mae, "CST") |>
filter(AVISITN == 0, ARM %in% arms) |>
mutate(subCST = assay$subCST) |>
select(USUBJID, subCST)
if (any(duplicated(V_subCST$USUBJID))) {
stop("Some participants have more than one subCST at baseline")
}
V_race <-
clin |>
filter(ARM %in% arms) |>
select(USUBJID, RACEGR2) |>
dplyr::rename(Race = RACEGR2) |>
distinct()
V_site <-
clin |>
filter(ARM %in% arms) |>
select(USUBJID, SITENAME) |>
dplyr::rename(Site = SITENAME) |>
distinct()
V_contraception <-
clin |>
filter(ARM %in% arms) |>
select(USUBJID, BC_W1_24) |>
dplyr::rename(Contraceptive = BC_W1_24) |>
distinct()
V_post_MTZ_BV <-
clin |>
filter(ARM %in% arms, AVISITN == 1) |>
select(USUBJID, AMSEL, NUGENT) |>
distinct() |>
mutate(
AMSEL = AMSEL |> as.character() |> as.numeric(),
NUGENT = NUGENT |> as.character() |> as.numeric(),
amsel_imp = AMSEL |> replace_na(4),
nugent_imp = NUGENT |> replace_na(10),
post_MTZ_BV = (amsel_imp %in% c(3,4)) & (nugent_imp %in% c(7:10))
) |>
dplyr::rename(post_MTZ_Amsel = AMSEL, post_MTZ_Nugent = NUGENT) |>
select(-amsel_imp, -nugent_imp)V <-
V_microbiota |>
full_join(V_topics, by = join_by(USUBJID)) |>
full_join(V_most_prev_genus, by = join_by(USUBJID)) |>
full_join(V_shannon_taxa, by = join_by(USUBJID)) |>
full_join(V_subCST, by = join_by(USUBJID)) |>
full_join(V_race, by = join_by(USUBJID)) |>
full_join(V_site, by = join_by(USUBJID)) |>
full_join(V_contraception, by = join_by(USUBJID)) |>
full_join(V_post_MTZ_BV, by = join_by(USUBJID))
rm(V_topics, V_most_prev_genus, V_microbiota, V_shannon_taxa, V_subCST, V_race, V_site, V_contraception, V_post_MTZ_BV)AYV <- full_join(AY, V, by = join_by(USUBJID))
AYV <- AYV |> mutate(id = row_number())
rm(AY, V)Before investigating effect heterogeneity, we estimate the average causal effects of LACTIN-V on the outcomes of interests (basically replicating NEJM Table 2 and our paper Table 1 results).
To estimate average causal effects (ignoring missing data), we compute risk (or benefit) ratio:
\[\hat{RR} = \frac{\hat{P}(Y = 1 | A = 1)}{\hat{P}(Y = 1 | A = 0)}\] (Null value = 1)
We compute these using the epitools::riskratio function, which computes risk/benefit ratios, Wald’s confidence intervals, and \(p\)-values.
# epitools::riskratio results are then formatted and displayed as a `gt` table using this function
gt_RR_table <- function(res, outcome, title = NULL){
if (outcome |> str_detect("Lc")) {
new_rownames <-
c(
str_c(c("< 50% L. crisp. at week "), parse_number(outcome)),
str_c(c("≥ 50% L. crisp. at week "), parse_number(outcome)),
"Total"
)
} else {
new_rownames <-
c(
str_c(c("no rBV by week "), parse_number(outcome)),
str_c(c("rBV by week "), parse_number(outcome)),
"Total"
)
}
res$data[1:2, ] |>
t() |>
set_rownames(new_rownames) |>
as.data.frame() |>
rownames_to_column("Outcome") |>
mutate(
RR =
c(
str_c(
res$measure[2, 1] |> round(2), " (",
res$measure[2, 2] |> round(2), "–",
res$measure[2, 3] |> round(2), ")"
), "",""
),
`p-value` =
c(
res$p.value[2, 1] |> format.pval(digits = 2),
"",""
)
) |>
gt() |>
gt::cols_label(
RR = ifelse(str_detect(outcome, "Lc"), "Benefit Ratio & 95% CI", "Risk Ratio & 95% CI")
) |>
gt::tab_header(title = title)
}Z <- qnorm(1 - (1 - conf.level)/2)
table(AYV$ARM, AYV$Lc_week_12) |>
epitools::riskratio() |>
gt_RR_table("Lc_week_12", title = "Lactobacillus crispatus colonization at week 12")| Lactobacillus crispatus colonization at week 12 | ||||
|---|---|---|---|---|
| Outcome | Placebo | LACTIN-V | Benefit Ratio & 95% CI | p-value |
| < 50% L. crisp. at week 12 | 51 | 86 | 3.37 (1.4–8.11) | 0.0013 |
| ≥ 50% L. crisp. at week 12 | 5 | 37 | ||
| Total | 56 | 123 | ||
# Seth values: 3.19 (1.32–7.70) for week 12
# they are slightly different than mine
# seth_w12 <-
# matrix(c(51 , 88, 5, 35), 2, 2) |>
# set_rownames(arms) |>
# set_colnames(0:1)
# seth_w12 |> epitools::riskratio() |>
# gt_RR_table("Lc_week_12", title = "Lactobacillus crispatus colonization at week 12 (Seth's numbers)")
# the same if I use his numberstable(AYV$ARM, AYV$Lc_week_24) |>
epitools::riskratio() |>
gt_RR_table("Lc_week_24", title = "Lactobacillus crispatus colonization at week 24")| Lactobacillus crispatus colonization at week 24 | ||||
|---|---|---|---|---|
| Outcome | Placebo | LACTIN-V | Benefit Ratio & 95% CI | p-value |
| < 50% L. crisp. at week 24 | 48 | 74 | 4.49 (1.69–11.9) | 0.00013 |
| ≥ 50% L. crisp. at week 24 | 4 | 39 | ||
| Total | 52 | 113 | ||
# Seth values: 4.37 (1.64–11.61) for week 24
# his numbers are slightly different than mine
# seth_w24 <-
# matrix(c(48 , 75, 4, 38), 2, 2) |>
# set_rownames(arms) |>
# set_colnames(0:1)
#
# seth_w24 |> epitools::riskratio() |>
# gt_RR_table("Lc_week_24", title = "Lactobacillus crispatus colonization at week 24 (Seth's numbers)")
# the same if I use his numberstable(AYV$ARM, AYV$rBV_week_12) |>
epitools::riskratio() |>
gt_RR_table("rBV_week_12", title = "rBV by week 12")| rBV by week 12 | ||||
|---|---|---|---|---|
| Outcome | Placebo | LACTIN-V | Risk Ratio & 95% CI | p-value |
| no rBV by week 12 | 31 | 91 | 0.65 (0.46–0.93) | 0.025 |
| rBV by week 12 | 30 | 43 | ||
| Total | 61 | 134 | ||
# NEJM values 0.66 (0.44–0.87)The NRJM values are slightly different than what we have here because not all participants are included here (samples for some participants were not shipped for the re-analysis.
nejm_data_summary <-
tibble(
ARM = rep(arms, each = 3) |> factor(levels = arms),
rBV = rep(c("rBV", "no rBV", NA), 2),
n = c(34, 30, 12, 46, 87, 19)
)
nejm_data <- nejm_data_summary[rep(1:6, nejm_data_summary$n), 1:2]
nejm_table <- table(nejm_data$ARM, nejm_data$rBV)
epitools::riskratio(nejm_table) |>
gt_RR_table("rBV_week_12", title = "rBV by week 12 (values from Cohen et al., NEJM)")| rBV by week 12 (values from Cohen et al., NEJM) | ||||
|---|---|---|---|---|
| Outcome | Placebo | LACTIN-V | Risk Ratio & 95% CI | p-value |
| no rBV by week 12 | 30 | 87 | 0.65 (0.47–0.9) | 0.015 |
| rBV by week 12 | 34 | 46 | ||
| Total | 64 | 133 | ||
# 0.6510394 (0.4689817 - 0.9037713)
# NEJM values 0.66 (0.44–0.87) # differences likely comes from imputation for missing valuestable(AYV$ARM, AYV$rBV_week_24) |>
epitools::riskratio() |>
gt_RR_table("rBV_week_24", title = "rBV by week 24")| rBV by week 24 | ||||
|---|---|---|---|---|
| Outcome | Placebo | LACTIN-V | Risk Ratio & 95% CI | p-value |
| no rBV by week 24 | 24 | 79 | 0.68 (0.51–0.9) | 0.012 |
| rBV by week 24 | 37 | 55 | ||
| Total | 61 | 134 | ||
nejm_data_summary <-
tibble(
ARM = rep(arms, each = 3) |> factor(levels = arms),
rBV = rep(c("rBV", "no rBV", NA), 2),
n = c(41, 21, 14, 59, 63, 30)
)
nejm_data <- nejm_data_summary[rep(1:6, nejm_data_summary$n), 1:2]
nejm_table <- table(nejm_data$ARM, nejm_data$rBV)
epitools::riskratio(nejm_table) |>
gt_RR_table("rBV_week_24", title = "rBV by week 24 (values from Cohen et al., NEJM)")| rBV by week 24 (values from Cohen et al., NEJM) | ||||
|---|---|---|---|---|
| Outcome | Placebo | LACTIN-V | Risk Ratio & 95% CI | p-value |
| no rBV by week 24 | 21 | 63 | 0.73 (0.57–0.94) | 0.023 |
| rBV by week 24 | 41 | 59 | ||
| Total | 62 | 122 | ||
# 0.6510394 (0.4689817 - 0.9037713)
# NEJM values 0.66 (0.44–0.87) # differences likely comes from imputation for missing valuestopics_long <-
AYV |>
select(USUBJID, topic, most_prev_genus) |>
unnest(topic) |>
pivot_longer(cols = -c(USUBJID, most_prev_genus), names_to = "topic", values_to = "prop") |>
# arrange(most_prev_genus, -prop) |>
mutate(topic = topic |> fct_inorder() |> fct_relevel("Ca. L. v. (BVAB1) topic", after = Inf))topics_long_ordered <-
topics_long |>
filter(!is.na(prop)) |>
left_join(
AYV |> select(USUBJID, ARM),
by = join_by(USUBJID)
) |>
arrange(USUBJID, -prop) |>
group_by(USUBJID) |>
mutate(
dominant_topic = topic[1],
prop_dom = prop[1],
order = weighted.mean(prop, topic |> as.integer())
) |>
ungroup() |>
# arrange(dominant_topic, order) |>
arrange(most_prev_genus, order) |>
mutate(USUBJID = USUBJID |> fct_inorder())outcomes_ordered_filtered <-
AYV |>
filter(USUBJID %in% topics_long_ordered$USUBJID) |>
mutate(
USUBJID = USUBJID |> factor(levels = levels(topics_long_ordered$USUBJID))
) |>
select(USUBJID, ARM, contains("_week_")) |>
pivot_longer(cols = contains("_week_"), names_to = "outcome", values_to = "value") |>
mutate(
Outcome = ifelse(str_detect(outcome, "Lc"), "≥50%\nL.c.", "rBV"),
week = str_c(
ifelse(str_detect(outcome, "Lc"), "at W", "by W"),
outcome |> parse_number()
) |> fct_inorder(),
value = value |> factor()
) |>
group_by(USUBJID) |>
mutate(has_some_obs = !all(is.na(value))) |>
ungroup() |>
filter(has_some_obs)topics_long_ordered_filtered <-
topics_long_ordered |>
filter(USUBJID %in% outcomes_ordered_filtered$USUBJID)g_observed_topics <-
ggplot(topics_long_ordered_filtered) +
aes(x = USUBJID, y = prop, fill = topic) +
geom_col() +
facet_grid(. ~ ARM, scales = "free_x", space = "free_x") +
scale_y_continuous("Pre-MTZ rel. ab.", labels = scales::label_percent()) +
scale_fill_manual(
"Taxon or Topic",
values = get_topic_colors(topics_long$topic |> levels()),
guide = guide_legend(direction = "vertical")
) +
scale_x_discrete(
"Participants, ordered by most prevalent genus and similarity in topic composition",
breaks = NULL
) +
theme(
legend.position = "right"
)
# g_observed_topicsg_observed_outcomes <-
ggplot(outcomes_ordered_filtered) +
aes(x = USUBJID, y = week |> fct_rev(), fill = Outcome, alpha = value) +
geom_tile() +
facet_grid(Outcome ~ ARM, scales = "free", space = "free") +
scale_fill_manual(
"Outcome",
values = get_topic_colors(c("I","IV")),
guide = guide_legend(direction = "vertical")
) +
scale_x_discrete("", breaks = NULL) +
scale_alpha_manual(
"Obs.", values = c(0.2, 1), breaks = c(0, 1),
labels = c("No", "Yes"),
guide = guide_legend(direction = "vertical"),
na.value = 0
) +
ylab("") +
theme(
legend.position = "right"# , strip.text.y = element_text(angle = -90, hjust = 0.5)
)
# g_observed_outcomes(g_observed_outcomes + theme(strip.text.y = element_blank())) +
(g_observed_topics + theme(strip.text.x = element_blank())) +
plot_layout(ncol = 1, height = c(1, 2), guides = "collect") &
theme(legend.position = "right")g_observed_topics_incl <-
topics_long_ordered_filtered |>
filter(most_prev_genus != "Sneathia", most_prev_genus != "Fannyhessea") |>
arrange(most_prev_genus) |>
mutate(mpg = most_prev_genus |> str_replace("actobacillus", "\\.") |> str_replace("Ca. Lachn. \\(BVAB1\\)", "BVAB1") |> fct_inorder()) |>
ggplot() +
aes(x = USUBJID, y = prop, fill = topic) +
geom_col() +
ggh4x::facet_nested(. ~ ARM + mpg, scales = "free_x", space = "free_x") +
scale_y_continuous("Pre-MTZ rel. ab.", labels = scales::label_percent()) +
scale_fill_manual(
"Taxon or Topic",
values = get_topic_colors(topics_long$topic |> levels()),
guide = guide_legend(direction = "vertical")
) +
scale_x_discrete(
"Participants, ordered by most prevalent genus and similarity in topic composition",
breaks = NULL
) +
theme(
legend.position = "right"
)
# g_observed_topicsg_observed_outcomes_incl <-
outcomes_ordered_filtered |>
left_join(topics_long_ordered_filtered |> select(USUBJID, most_prev_genus) |> distinct(), by = "USUBJID") |>
filter(most_prev_genus != "Sneathia", most_prev_genus != "Fannyhessea") |>
arrange(most_prev_genus) |>
mutate(mpg = most_prev_genus |> str_replace("actobacillus", "\\.") |> str_replace("Ca. Lachn. \\(BVAB1\\)", "BVAB1") |> fct_inorder()) |>
ggplot() +
aes(x = USUBJID, y = week |> fct_rev(), fill = Outcome, alpha = value) +
geom_tile() +
ggh4x::facet_nested(Outcome ~ ARM + mpg, scales = "free", space = "free") +
scale_fill_manual(
"Outcome",
values = get_topic_colors(c("I","IV")),
guide = guide_legend(direction = "vertical")
) +
scale_x_discrete("", breaks = NULL) +
scale_alpha_manual(
"Obs.", values = c(0.2, 1), breaks = c(0, 1),
labels = c("No", "Yes"),
guide = guide_legend(direction = "vertical"),
na.value = 0
) +
ylab("") +
theme(
legend.position = "right"# , strip.text.y = element_text(angle = -90, hjust = 0.5)
)
# g_observed_outcomes(g_observed_outcomes_incl + theme(strip.text.y = element_blank())) +
(g_observed_topics_incl + theme(strip.text.x = element_blank())) +
plot_layout(ncol = 1, height = c(1, 2), guides = "collect") &
theme(legend.position = "right")To test for heterogeneity of effects, we fit two logistic regression models with the outcome as the response variable.
\[ y_i \sim \text{Bern}(p_i) \]
\[ p_i = \text{logistic}(\mu_i) \]
\[ m_0: \mu_i = \beta_0 + \beta_1 A_i \]
Where \(A_i\) is the treatment arm of participant \(i\) (0 = Placebo, 1 = Lactin-V).
\[ m_1: \mu_i = \beta_0 + \beta_1 A_i + \sum_k\beta_{2k} V_{ki} + \sum_k\beta_{3k} A_i V_{ki} = f_1(A, \mathbf{V}) \]
Where \(V_{ki}\) is the \(k\)-th microbiota composition variable for participant \(i\) (can be categorical for a “stratification analysis” or continuous, assuming a linear model is biologically relevant).
Note: Since we observed overdispersion in our data, we fit the models using the quasibinomial (instead of the binomial) family to estimate the deviance and standard errors more robustly.
Prior to the fits, participants for which the data for the outcome (\(Y\)), or the microbiota composition (\(\mathbf{V}\)) are missing are excluded for both fits, so that the null and the full models are fitted on the same data.
Heterogeneity of effects is tested by comparing the full model (\(m_1\)) to the null model (\(m_0\)) using an analysis of deviance \(F\) test (given the overdispersion).
glm_family <- "quasibinomial"
test_deviance <- "F" # change to LRT if the binomial family is usedThis is implemented in the fit_outcome_prediction_models function.
fit_outcome_prediction_models
## function (data = AYV, outcome = "Lc_week_12", covariate = "microbiota",
## use_IPW = FALSE, model = "glm", family = "quasibinomial",
## test_deviance = "F")
## {
## if (model != "glm")
## stop("not implemented yet")
## formated_data <- unnest(select(data, USUBJID, ARM, !!sym(outcome),
## !!sym(covariate)), cols = !!sym(covariate), names_sep = "_")
## if (!is.null(ncol(data[[covariate]]))) {
## V_names <- str_c("`", covariate, "_", colnames(data[[covariate]]),
## "`")
## }
## else {
## V_names <- covariate
## }
## filtered_data <- drop_na(formated_data)
## if (!use_IPW) {
## filtered_data$propensity_score <- 0.5
## }
## else {
## filtered_data <- mutate(filtered_data, arm = (ARM !=
## "Placebo") * 1)
## ps_formula <- str_c("arm ~ ", str_c(V_names, collapse = " + "))
## ps_model <- glm(ps_formula, data = filtered_data, family = "binomial")
## filtered_data$propensity_score <- predict(ps_model, type = "response")
## }
## filtered_data <- mutate(filtered_data, w = case_when((!!sym(outcome) ==
## 1) ~ (1/propensity_score), TRUE ~ 1/(1 - propensity_score)))
## m0_formula <- str_c(outcome, " ~ ARM")
## m_formula <- str_c(outcome, " ~ ARM + ", str_c(V_names, collapse = " + "),
## " + ", str_c(str_c(V_names, ":ARM"), collapse = " + "))
## m0 <- glm(m0_formula, data = filtered_data, family = family,
## weights = w)
## m <- glm(m_formula, data = filtered_data, family = family,
## weights = w)
## heterogeneity_test <- anova(m, m0, test = test_deviance)
## list(original_data = data, formated_data = formated_data,
## model_fit_data = filtered_data, outcome = outcome, covariate = covariate,
## V_names = V_names, m0 = m0, m = m, heterogeneity_test = heterogeneity_test)
## }AYV |> filter(!is.na(most_prev_genus)) |> dplyr::count(most_prev_genus, ARM) |> gt(
caption = "Number of participants per most prevalent genus at the pre-MTZ visit"
)| most_prev_genus | ARM | n |
|---|---|---|
| Lactobacillus | Placebo | 4 |
| Lactobacillus | LACTIN-V | 20 |
| Gardnerella | Placebo | 32 |
| Gardnerella | LACTIN-V | 56 |
| Prevotella | Placebo | 23 |
| Prevotella | LACTIN-V | 38 |
| Ca. Lachn. (BVAB1) | Placebo | 9 |
| Ca. Lachn. (BVAB1) | LACTIN-V | 18 |
| Sneathia | LACTIN-V | 5 |
| Fannyhessea | LACTIN-V | 3 |
Strata with less than 2 participants per arm are excluded from the analysis.
plot_stratified_risks(AYV |> filter(!(most_prev_genus %in% c("Sneathia", "Fannyhessea"))), outcomes, strata = "most_prev_genus", min_n_in_each_arm = 2) +
plot_stratified_differences(AYV |> filter(!(most_prev_genus %in% c("Sneathia", "Fannyhessea"))), outcomes, strata = "most_prev_genus", min_n_in_each_arm = 2) +
plot_layout(ncol = 1) &
theme(axis.text.x = element_text(angle = 45, hjust = 1))models_most_prevalent_genus <-
map(
outcomes,
~ fit_outcome_prediction_models(
AYV |> filter(!is.na(most_prev_genus), !(most_prev_genus %in% c("Sneathia", "Fannyhessea"))),
outcome = .x, covariate = "most_prev_genus",
model = "glm", family = glm_family, test_deviance = test_deviance
)
)
models_most_prevalent_genus |>
map(
~ tibble(
outcome = .x$outcome,
`p-value` = .x$heterogeneity_test$Pr[2]
)
) |>
bind_rows() |>
mutate(
`BH adjusted p-value` = p.adjust(`p-value`, method = "BH") |> format.pval(digits = 2),
`BY adjusted p-value` = p.adjust(`p-value`, method = "BY") |> format.pval(digits = 2),
`p-value` = `p-value` |> format.pval(digits = 2)
) |> gt(
caption = "Test for heterogeneity of treatment effects by pre-MTZ most prevalent genus"
) | outcome | p-value | BH adjusted p-value | BY adjusted p-value |
|---|---|---|---|
| Lc_week_12 | 0.11646 | 0.1165 | 0.2426 |
| Lc_week_24 | 0.02196 | 0.0293 | 0.0610 |
| rBV_week_12 | 0.01062 | 0.0212 | 0.0443 |
| rBV_week_24 | 0.00093 | 0.0037 | 0.0077 |
AYV |> dplyr::count(subCST, ARM) |> gt(
caption = "Number of participants per subCST at the pre-MTZ visit"
)| subCST | ARM | n |
|---|---|---|
| III-A | Placebo | 1 |
| III-A | LACTIN-V | 10 |
| III-B | Placebo | 2 |
| III-B | LACTIN-V | 7 |
| IV-A | Placebo | 13 |
| IV-A | LACTIN-V | 38 |
| IV-B | Placebo | 51 |
| IV-B | LACTIN-V | 82 |
| IV-C0 | Placebo | 1 |
| IV-C0 | LACTIN-V | 1 |
| IV-C1 | LACTIN-V | 1 |
| V | LACTIN-V | 1 |
| NA | Placebo | 8 |
| NA | LACTIN-V | 12 |
Categories with less than 2 participants per arm are excluded from the analysis.
models_csts <-
map(
outcomes,
~ fit_outcome_prediction_models(
AYV |> filter(subCST %in% c("IV-A", "IV-B")),
outcome = .x, covariate = "subCST",
model = "glm", family = glm_family, test_deviance = test_deviance
)
)
pvals_csts <-
models_csts |>
map(
~ tibble(
outcome = .x$outcome,
`p-value` = .x$heterogeneity_test$Pr[2]
)
) |>
bind_rows() |>
mutate(
`BH adjusted p-value` = p.adjust(`p-value`, method = "BH") |> scales::pvalue(accuracy = 0.01),
`BY adjusted p-value` = p.adjust(`p-value`, method = "BY") |> scales::pvalue(accuracy = 0.01),
`p-value` = `p-value` |> format.pval(digits = 2)
)
pvals_csts |> gt(
caption = "Test for heterogeneity of treatment effects by pre-MTZ subCSTs"
) | outcome | p-value | BH adjusted p-value | BY adjusted p-value |
|---|---|---|---|
| Lc_week_12 | 0.6716 | 0.67 | >0.99 |
| Lc_week_24 | 0.2145 | 0.29 | 0.60 |
| rBV_week_12 | 0.0554 | 0.11 | 0.23 |
| rBV_week_24 | 0.0086 | 0.03 | 0.07 |
g <-
plot_stratified_risks(AYV, outcomes, strata = "subCST", min_n_in_each_arm = 2) +
plot_stratified_differences(AYV, outcomes, strata = "subCST", min_n_in_each_arm = 2) +
xlab("CST") +
geom_label(
data =
pvals_csts |>
mutate(
outcome_label =
outcome |>
str_replace("Lc_", "≥50% L. crisp. at ") |>
str_replace("rBV_", "any rBV by ") |>
str_replace("week_", "week ")
),
aes(label = str_c("adj. p: ", `BH adjusted p-value`)),
x = -Inf, y = -Inf, hjust = -0.1, vjust = -0.5, color = "black", size = 3, label.size = 0
) +
plot_layout(ncol = 1)
gggsave(g, filename = str_c(fig_out_dir(), "S5.pdf"), width = 10, height = 6, device = cairo_pdf)Before trying to estimate whether there is any heterogeneity in effects by pre-MTZ microbiota composition as expressed as subcommunity relative abundances, we can check whether the distribution of the pre-MTZ microbiota composition is similar between the two arms. If the overlap in distribution is too low, then our downstream analyses will be dubious.
To do so, we estimate propensity scores (the probability for each baseline microbiota composition to be in the treatment (Lactin-V) arm), using a logistic regression model where the response is the treatment arm and the explanatory variable is the pre-MTZ microbiota composition.
# This is done selecting microbiota and arm, making arm a binary variable (1 = Lactin-V, 0 = Placebo), and excluding any participants for whom the arm or the baseline microbiota would be unknown.
formated_data <-
AYV |>
select(USUBJID, ARM, microbiota) |>
unnest(cols = microbiota, names_sep = "_") |>
mutate(arm = (ARM != "Placebo")*1) |>
drop_na()
propensity_scores_model <-
glm(arm ~ ., data = formated_data |> select(-USUBJID, -ARM), family = binomial())
propensity_scores <-
formated_data |>
mutate(
propensity_score = predict(propensity_scores_model, type = "response")
) propensity_scores |>
ggplot(aes(x = propensity_score, fill = ARM)) +
geom_histogram(binwidth = 0.01) +
geom_vline(xintercept = 2/3, linetype = "dashed", alpha = 0.5) +
facet_grid(ARM ~ .) +
xlab("Propensity score") +
scale_fill_manual("Treatment arm", values = get_arm_colors()) +
labs(caption = "The dashed vertical line corresponds to the 2:1 randomization ratio")Overall, the overlap seems to be sufficient, but a few participants are over-represented in the treatment arm and two in the Placebo arm.
topics_long_with_ps <-
topics_long |>
left_join(propensity_scores |> select(USUBJID, ARM, propensity_score), by = join_by(USUBJID)) |>
arrange(propensity_score) |>
mutate(USUBJID = USUBJID |> fct_inorder()) |>
filter(!is.na(ARM))g_baseline <-
topics_long_with_ps |>
ggplot(aes(x = USUBJID, y = prop, fill = topic)) +
geom_col() +
scale_fill_manual(
"Taxon or topic",
values = get_topic_colors(levels(topics_long_with_ps$topic)),
guide = guide_legend(direction = "vertical", ncol = 1)
) +
scale_x_discrete("") +
scale_y_continuous("Relative\nabundance") +
theme(axis.text.x = element_blank())
prop_score_limits <- 2/3 * c(0.85, 1, 1/0.85)
g_prop <-
topics_long_with_ps |>
ggplot(aes(x = USUBJID, y = propensity_score, col = propensity_score)) +
geom_hline(yintercept = prop_score_limits, linetype = "dashed", alpha = 0.5) +
annotate(
"text", x = Inf, y = prop_score_limits[2], label = "2:1 randomization ratio",
hjust = 1.05, vjust = -0.7, size = 3
) +
geom_point(size = 0.5) +
scale_color_gradient2(
"Propensity\nscores",
low = "purple", mid = "steelblue2", high = "red",
midpoint = 2/3, limits = c(1/3, 1), breaks = seq(0, 1, by = 0.5),
guide = guide_colourbar(direction = "vertical")
) +
ylab("Propensity\nscores") +
scale_x_discrete("") +
theme(axis.text.x = element_blank())
g_arm <-
topics_long_with_ps |>
ggplot(aes(x = USUBJID, y = 1, fill = ARM)) +
geom_tile() +
scale_y_continuous("\nArm", breaks = NULL) +
scale_fill_manual("Arm", values = get_arm_colors(), guide = guide_legend(direction = "vertical")) +
theme(axis.text.x = element_blank())g_arm + xlab("") +
g_prop +
g_baseline +
g_arm + xlab("Participants") +
plot_layout(heights = c(1, 2, 4, 1)) &
theme(legend.position = "right")Some microbiota compositions (such as L. iners-dominant ones) are under-represented, but they also have higher propensity scores, indicating that they are more common in the Lactin-V arm than in the Placebo arm.
This imbalance might be further exacerbated when fitting the models as some participants will be excluded due to missing values at the endpoint visits (week 12 or week 24). The fit_outcome_prediction_models function allows to use inverse propensity score weighting to account for this imbalance when fitting the models if desired. However, as this can also introduce biases, we do not use it here.
If there is heterogeneity in effects, we then use the full model predictions to identify subgroups of participants that may benefit more or less from the treatment based on their pre-MTZ microbiota composition (or any other variable we may be interested in checking for heterogeneity).
To do that, we use counterfactual predictions: for all participants for which pre-MTZ microbiota composition is available (even if their outcome is missing), we predict, using the full model, the probabilities (and associated confidence intervals) of their outcome under the treatment (\(\hat{p}_i^{(1)}\)) or the placebo (\(\hat{p}_i^{(0)}\)). We can also compute the log(odds) (and associated confidence intervals) of these probabilities (\(\hat{o}_i^{(1)}\) and \(\hat{o}_i^{(0)}\)), which are the linear predictors of the model under the treatment (\(\hat{\mu}_i^{(1)} = \hat{f}_1(A = 1, \mathbf{V})\)) and placebo arm (\(\hat{\mu}_i^{(0)} = \hat{f}_1(A = 0, \mathbf{V})\)).
We then estimate the “unit-level” log(Odd Ratio) for each participant (\(\log(\hat{\text{OR}}_i)\)), and associated confidence intervals. The participant-level predicted log(Odd Ratios) are computed as \(\log(\hat{\text{OR}}_i) = \log\left(\frac{\hat{o}_i^{(1)}}{\hat{o}_i^{(0)}}\right) = \log(\hat{o}_i^{(1)}) - \log(\hat{o}_i^{(0)}) = \hat{\mu}_i^{(1)} - \hat{\mu}_i^{(0)} = \hat{\Delta}_{\mu_i}\).
Confidence intervals for these participant-level predicted log(Odd Ratios) are computed using the marginaleffects::comparisons function.
citation("marginaleffects")To cite marginaleffects in publications use:
Arel-Bundock V, Greifer N, Heiss A (2024). "How to Interpret
Statistical Models Using marginaleffects for R and Python." _Journal
of Statistical Software_, *111*(9), 1-32. doi:10.18637/jss.v111.i09
<https://doi.org/10.18637/jss.v111.i09>.
A BibTeX entry for LaTeX users is
@Article{,
title = {How to Interpret Statistical Models Using {marginaleffects} for {R} and {Python}},
author = {Vincent Arel-Bundock and Noah Greifer and Andrew Heiss},
journal = {Journal of Statistical Software},
year = {2024},
volume = {111},
number = {9},
pages = {1--32},
doi = {10.18637/jss.v111.i09},
}
TODO: format citation
These participant-level predicted log(Odd Ratios) reflect the predicted effect of the treatment for that participant. If their confidence interval includes 0, this participant is predicted to neither benefit nor be harmed by the treatment. If the confidence interval is entirely above 0, the participant is predicted to benefit from the treatment (if the outcome is a positive outcome such as colonization by L. crispatus), and if it is entirely below 0, the participant is predicted to be harmed by the treatment.
This is implemented in the predict_counterfactuals function.
predict_counterfactuals
## function (models, conf.level = 0.95)
## {
## Z <- qnorm(1 - (1 - conf.level)/2)
## j <- which(!is.na(pull(models$formated_data, str_remove_all(models$V_names[1],
## "`"))))
## res <- mutate(bind_rows(mutate(models$formated_data[j, ],
## type = "actual"), mutate(models$formated_data[j, ], type = "predicted",
## ARM = "LACTIN-V"), mutate(models$formated_data[j, ],
## type = "predicted", ARM = "Placebo")), type = factor(type,
## levels = c("actual", "predicted")), ARM = factor(ARM,
## levels = levels(models$formated_data$ARM)))
## if (str_detect(models$m$family$family, "binomial")) {
## inv_link <- plogis
## }
## else if (models$m$family$family == "poisson") {
## inv_link <- exp
## stop("not implemented yet")
## }
## else {
## stop("not implemented yet")
## }
## domain <- summarize(group_by(pivot_longer(select(models$model_fit_data,
## starts_with(models$covariate)), cols = everything(),
## names_to = "covariate", values_to = "value"), covariate),
## min = min(value), max = max(value))
## res <- mutate(pivot_wider(ungroup(mutate(group_by(select(mutate(left_join(pivot_longer(res,
## cols = starts_with(models$covariate), names_to = "covariate",
## values_to = "value"), domain, by = join_by(covariate)),
## in_domain = (value >= min) & (value <= max)), -min, -max),
## USUBJID), in_domain = all(in_domain))), names_from = covariate,
## values_from = value), in_domain_or_NA = ifelse(!in_domain,
## NA, 1))
## res <- select(mutate(res, mu = predict(models$m, type = "link",
## new = res) * in_domain_or_NA, se_mu = predict(models$m,
## type = "link", new = res, se.fit = TRUE, level = conf.level)$se.fit *
## in_domain_or_NA, se_mu = ifelse(type == "actual", NA,
## se_mu), p_hat = pmin(ifelse(type == "actual", !!sym(models$outcome),
## inv_link(mu)), 1), p_hat_CI_low = pmax(inv_link(mu -
## Z * se_mu), 0), p_hat_CI_high = pmin(inv_link(mu + Z *
## se_mu), 1)), -in_domain_or_NA)
## res <- ungroup(mutate(group_by(arrange(res, USUBJID, desc(type),
## desc(ARM)), USUBJID), delta_mu = mu[1] - mu[2], delta_p = p_hat[1] -
## p_hat[2]))
## me_res <- marginaleffects::comparisons(models$m, newdata = filter(res,
## type == "actual", in_domain), comparison = "difference",
## type = "link", variable = "ARM", conf_level = conf.level)
## me_res_tb <- bind_cols(select(extract(filter(res, type ==
## "actual", in_domain), me_res$rowid, ), USUBJID, ARM,
## models$outcome), tibble(delta_mu = me_res$estimate, delta_mu_CI_low = me_res$conf.low,
## delta_mu_CI_high = me_res$conf.high))
## tmp_check <- mutate(left_join(select(filter(res, type ==
## "actual", in_domain), USUBJID, ARM, delta_mu), me_res_tb,
## by = join_by(USUBJID, ARM)), delta_mu_diff = delta_mu.x -
## delta_mu.y)
## if (!all(abs(tmp_check$delta_mu_diff) < 1e-10)) {
## stop("The delta_mu computed using marginaleffects is not the same as the one computed using the model.")
## }
## res <- left_join(res, select(me_res_tb, USUBJID, delta_mu_CI_low,
## delta_mu_CI_high), by = join_by(USUBJID))
## list(long_res = res, me_res = me_res)
## }prediction_models <-
outcomes |>
map(
~ fit_outcome_prediction_models(
AYV, outcome = .x, covariate = "microbiota", family = glm_family, test_deviance = test_deviance
)
) |>
set_names(outcomes)test_results <-
prediction_models |>
map(
~ tibble(
outcome = .x$outcome,
`p-value` = .x$heterogeneity_test$Pr[2]
)
) |>
bind_rows() |>
mutate(
`BH adjusted p-value` = p.adjust(`p-value`, method = "BH") |> format.pval(digits = 2),
`p-value` = `p-value` |> format.pval(digits = 2)
)
test_results |> gt(
caption = "Test for heterogeneity of treatment effects by pre-MTZ microbiota composition"
) | outcome | p-value | BH adjusted p-value |
|---|---|---|
| Lc_week_12 | 0.6495 | 0.6495 |
| Lc_week_24 | 0.0125 | 0.0167 |
| rBV_week_12 | 0.0099 | 0.0167 |
| rBV_week_24 | 0.0008 | 0.0032 |
These results suggest an absence of effect heterogeneity when considering L. crispatus colonization at week 12 as the outcome, and a significant heterogeneity in treatment effects when considering the other outcomes.
We next visualize the unit-level conditional treatment effects.
counterfactuals_list <-
prediction_models |>
map(
~ predict_counterfactuals(.x)
) |>
set_names(outcomes)First we visualize these with participants ordered by their predicted log(OR):
map2(
.x = counterfactuals_list,
.y = prediction_models,
~ plot_counterfactuals(
counterfactuals = .x,
models = .y
)
)$Lc_week_12
$Lc_week_24
$rBV_week_12
$rBV_week_24
Next, we propose these visualization, ordering participants by their predicted probability of “success” in the Placebo arm and faceted by arm:
map2(
.x = counterfactuals_list,
.y = prediction_models,
~ plot_counterfactuals(
counterfactuals = .x,
models = .y,
order_by = "p_hat",
facet_by = "ARM"
)
)$Lc_week_12
$Lc_week_24
$rBV_week_12
$rBV_week_24
These results show that
“Individually”, no participant is predicted to significantly benefit or be harmed by the treatment since all confidence intervals include 0.
participants with high pre-MTZ proportions of the topic dominated by P. amnii are predicted to benefit the most from the treatment, regardless of the considered outcome because for these participants, their probability of colonizing with L. c. or not having rBV is highly increased in the Lactin-V arm.
participants with high pre-MTZ proportions of Lactobacillus are predicted to benefit from Lactin-V when considering L. crispatus colonization as outcome, but not when considering rBV as outcome. This is likely because these participants are unlikely to have BV again if they started with high proportions of Lactobacillus.
participants with high pre-MTZ proportions of topic dominated by BVAB1 are predicted to benefit least from the treatment, regardless of the considered outcome.
Summarizing these results into a single figure and ordering participants by their predicted effects combined across outcomes, three groups of participants emerge:
combined_predicted_effects <-
counterfactuals_list |>
map(
~ .x$long_res |>
filter(type == "actual") |>
select(USUBJID, ARM, delta_p, delta_mu, delta_mu_CI_low, delta_mu_CI_high)
) |>
bind_rows(.id = "outcome") |>
mutate(
estimated_benefit_difference = ifelse(str_detect(outcome, "Lc"), delta_p, -delta_p),
Outcome = outcome |> str_remove("_week_[0-9]*") |> str_replace("Lc", "≥ 50%\nL. c."),
week =
str_c(
ifelse(str_detect(outcome, "rBV"), "by", "at"),
" W", outcome |> parse_number()
) |> fct_inorder()
) participant_order <-
combined_predicted_effects |>
mutate(
outcome_type = ifelse(str_detect(outcome, "Lc"), "Lc", "rBV"),
) |>
group_by(USUBJID, outcome_type) |>
summarize(mean_benefit = mean(estimated_benefit_difference, na.rm = TRUE), .groups = "drop") |>
group_by(USUBJID) |>
summarize(
max_mean_benefit = max(mean_benefit, na.rm = TRUE),
min_mean_benefit = min(mean_benefit, na.rm = TRUE),
mean_mean_benefit = mean(mean_benefit, na.rm = TRUE),
group =
case_when(
min_mean_benefit > 0 ~ "G1",
(max_mean_benefit > 0) & (abs(min_mean_benefit) < max_mean_benefit/2) ~ "G3", # (abs(diff_mean_benefit) > max_mean_benefit) &
TRUE ~ "G2",
) |> factor()
) |>
arrange(group, -mean_mean_benefit) |>
mutate(USUBJID = USUBJID |> fct_inorder())benefit_range <-
combined_predicted_effects$estimated_benefit_difference |>
abs() |> max() |> round(1) |> add(0.1)
g_effects <-
combined_predicted_effects |>
mutate(USUBJID = USUBJID |> factor(levels = levels(participant_order$USUBJID))) |>
left_join(
participant_order,
by = join_by(USUBJID)
) |>
ggplot() +
aes(x = USUBJID, y = week |> fct_rev(), fill = estimated_benefit_difference) +
geom_tile() +
facet_grid(Outcome ~ group, scale = "free", space = "free_x") +
scale_fill_gradient2(
"Point-predicted\nbenefit",
low = "coral", mid = "white", high = "steelblue1",
midpoint = 0, limits = c(-benefit_range, benefit_range), breaks = c(-benefit_range, 0, benefit_range),
guide = guide_colourbar(direction = "vertical")
) +
ylab("") +
scale_x_discrete("", breaks = NULL) +
theme(legend.position = "right") g_comp <-
topics_long |>
filter(USUBJID %in% participant_order$USUBJID) |>
mutate(USUBJID = USUBJID |> factor(levels = levels(participant_order$USUBJID))) |>
left_join(
participant_order |> select(USUBJID, group),
by = join_by(USUBJID)
) |>
ggplot() +
aes(x = USUBJID, y = prop, fill = topic) +
geom_col() +
facet_grid(. ~ group, scale = "free_x", space = "free_x") +
scale_fill_manual(
"Taxon or topic",
values = get_topic_colors(levels(topics_long$topic)),
guide = guide_legend(direction = "vertical", ncol = 1)
) +
scale_x_discrete("Participants, ordered by average effects within groups", breaks = NULL) +
scale_y_continuous("Pre-MTZ rel. ab.", expand = expansion(add = 0)) +
theme(legend.position = "right")g_effects +
(g_comp + theme(strip.text.x = element_blank())) +
plot_layout(heights = c(1, 2.5), ncol = 1)In the figure above, the “point-predicted benefits” (colors of upper panels) are the estimated differences in the probabilities of the outcomes (e.g., L. crispatus colonization, or absence of rBV) between the Lactin-V and Placebo arms for each participant, with the colors indicating whether the participant is point-predicted to benefit (blue), be harmed (red), or neither (white).
The participants are ordered by their average point-predicted benefit across outcomes, and grouped into three groups:
Consistent with the stratified analysis where no participant was (individually or as a group) predicted to be harmed (all confidence intervals include 0), the point estimates of the predicted log(OR) suggest that some participants (e.g., those with high pre-MTZ BVAB1 proportions) benefit less than their counterpart.
pvals <-
models_most_prevalent_genus |>
map(
~ tibble(
outcome = .x$outcome,
`p-value` = .x$heterogeneity_test$Pr[2]
)
) |>
bind_rows() |>
mutate(
`BH adjusted p-value` = p.adjust(`p-value`, method = "BH") |> scales::pvalue(accuracy = 0.01),
`BY adjusted p-value` = p.adjust(`p-value`, method = "BY") |> format.pval(digits = 2),
`p-value` = `p-value` |> format.pval(digits = 2),
outcome_label =
outcome |>
str_replace("Lc_", "≥50% L. crisp. at ") |>
str_replace("rBV_", "any rBV by ") |>
str_replace("week_", "week ")
) g_strat_LC <-
(
(
plot_stratified_risks(AYV, outcomes[1:2], strata = "most_prev_genus", min_n_in_each_arm = 2) +
guides(col = "none")
) /
(
plot_stratified_differences(AYV, outcomes[1:2], strata = "most_prev_genus", min_n_in_each_arm = 2) +
guides(col = "none") +
geom_label(
data = pvals |> filter(outcome == outcomes[1:2]),
aes(label = str_c("adj. p: ", `BH adjusted p-value`)),
x = Inf, y = Inf, hjust = 1.1, vjust = 1.1, color = "black", size = 3, label.size = 0
)
)
)
g_strat_rBV <-
(
(
plot_stratified_risks(AYV, outcomes[3:4], strata = "most_prev_genus", min_n_in_each_arm = 2)
) /
(
plot_stratified_differences(AYV, outcomes[3:4], strata = "most_prev_genus", min_n_in_each_arm = 2) +
geom_label(
data = pvals |> filter(outcome == outcomes[3:4]),
aes(label = str_c("adj. p: ", `BH adjusted p-value`)),
x = Inf, y = -Inf, hjust = 1.1, vjust = -0.1, color = "black", size = 3, label.size = 0
)
)
) fig_5 <-
# observed data
(
(g_observed_outcomes_incl + guides(fill = "none")) /
(g_observed_topics_incl + theme(strip.text.x = element_blank()))
) /
plot_spacer() /
# stratified analysis
(
(g_strat_LC | g_strat_rBV) + plot_layout(ncol = 2) & theme(axis.text.x = element_text(angle = 45, hjust = 1))
) +
# assembly
plot_annotation(tag_levels = "A") +
plot_layout(nrow = 5, heights = c(1, 2.5, 0.5, 5.5)) &
theme(legend.position = "right")
fig_5ggsave(fig_5, filename = str_c(fig_out_dir(), "Fig5.pdf"), width = 13, height = 11, device = cairo_pdf)Observed primary and secondary outcomes. Each rectangle’s shade indicates the observed outcome (light color for a negative outcome, dark color for a positive outcome, orange hue for L. crispatus colonization, and blue hue for rBV) at both endpoints (week 12, and 24, y-axis) for each participant (x-axis), ordered by dominant subcommunity and microbiota similarity as in panel (b) in both study arms (horizontal panels).
Observed pre-MTZ microbiota composition in terms of relative abundance (y-axis) of each topic (colors).
Rates and associated 95% CI of L. crispatus colonization (≥50% of relative abundance) (y-axis) at week 12 or 24 (horizontal panels) by pre-MTZ microbiota composition categorized in dominant sub-community (x-axis) in each arm (colors). CI are computed using Wilson’s scores given the low sample sizes in some strata.
Rate differences and associated 95% CI between arms (y-axis) for L. crispatus colonization by pre-MTZ microbiota composition categorized in dominant sub-community (x-axis). CI are computed using Wilson’s score method given the low sample sizes in some strata. Color indicate benefit, defined as rate differences (rate in Lactin-V arm - rate in Placebo arm).
Same as (c) but for rBV by week 12 or 24.
Same (d) but for rBV by week 12 or 24. Here, benefit is defined as the opposite rate difference (rate in Placebo arm - rate in Lactin-V arm).
map(
outcomes,
~ plot_risks(
AYV,
outcome = .x,
covariate = "Race",
min_n_each_arm = 5
) +
plot_annotation(
title =
.x |> str_replace("rBV_week_", "rBV by week ") |>
str_replace("Lc_week_", "L. crispatus colonization at week ")
)
)[[1]]
[[2]]
[[3]]
[[4]]
models_race <-
map(
outcomes,
~ fit_outcome_prediction_models(
AYV,
outcome = .x, covariate = "Race", use_IPW = TRUE,
model = "glm", family = glm_family, test_deviance = test_deviance
)
)
models_race |>
map(
~ tibble(
outcome = .x$outcome,
`p-value` = .x$heterogeneity_test$Pr[2]
)
) |>
bind_rows() |>
mutate(
`BH adjusted p-value` = p.adjust(`p-value`, method = "BH") |> format.pval(digits = 2),
`p-value` = `p-value` |> format.pval(digits = 2)
) |> gt(
caption = "Test for heterogeneity of treatment effects by participants' self-reported Race"
) | outcome | p-value | BH adjusted p-value |
|---|---|---|
| Lc_week_12 | 0.20 | 0.57 |
| Lc_week_24 | 0.42 | 0.57 |
| rBV_week_12 | 0.69 | 0.69 |
| rBV_week_24 | 0.37 | 0.57 |
map(
outcomes,
~ plot_risks(
AYV,
outcome = .x,
covariate = "Contraceptive",
min_n_each_arm = 5
) +
plot_annotation(
title =
.x |> str_replace("rBV_week_", "rBV by week ") |>
str_replace("Lc_week_", "L. crispatus colonization at week ")
)
)models_contraception <-
map(
outcomes,
~ fit_outcome_prediction_models(
AYV,
outcome = .x, covariate = "Contraceptive", use_IPW = TRUE,
model = "glm", family = glm_family, test_deviance = test_deviance
)
)
models_contraception |>
map(
~ tibble(
outcome = .x$outcome,
`p-value` = .x$heterogeneity_test$Pr[2]
)
) |>
bind_rows() |>
mutate(
`BH adjusted p-value` = p.adjust(`p-value`, method = "BH") |> format.pval(digits = 2),
`p-value` = `p-value` |> format.pval(digits = 2)
) |> gt(
caption = "Test for heterogeneity of treatment effects by participants' contraceptive"
) | outcome | p-value | BH adjusted p-value |
|---|---|---|
| Lc_week_12 | 0.571 | 0.76 |
| Lc_week_24 | 0.837 | 0.84 |
| rBV_week_12 | 0.199 | 0.40 |
| rBV_week_24 | 0.032 | 0.13 |
map(
outcomes,
~ plot_risks(
AYV,
outcome = .x,
covariate = "Site",
min_n_each_arm = 5
) +
plot_annotation(
title =
.x |> str_replace("rBV_week_", "rBV by week ") |>
str_replace("Lc_week_", "L. crispatus colonization at week ")
)
)[[1]]
[[2]]
[[3]]
[[4]]
models_sites <-
map(
outcomes,
~ fit_outcome_prediction_models(
AYV,
outcome = .x, covariate = "Site", use_IPW = TRUE,
model = "glm", family = glm_family, test_deviance = test_deviance
)
)
models_sites |>
map(
~ tibble(
outcome = .x$outcome,
`p-value` = .x$heterogeneity_test$Pr[2]
)
) |>
bind_rows() |>
mutate(
`BH adjusted p-value` = p.adjust(`p-value`, method = "BH") |> format.pval(digits = 2),
`p-value` = `p-value` |> format.pval(digits = 2)
) |> gt(
caption = "Test for heterogeneity of treatment effects by participants' recruitment site"
) | outcome | p-value | BH adjusted p-value |
|---|---|---|
| Lc_week_12 | 0.313 | 0.417 |
| Lc_week_24 | 0.726 | 0.726 |
| rBV_week_12 | 0.096 | 0.193 |
| rBV_week_24 | 0.017 | 0.067 |
There is no heterogeneity by study site, which is reassuring for the generalizability of the results.
sessionInfo()R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Europe/Brussels
tzcode source: internal
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] ggthemes_5.1.0 geepack_1.3.12 PropCIs_0.3-0 phyloseq_1.50.0
[5] gt_1.0.0 patchwork_1.3.0 magrittr_2.0.3 lubridate_1.9.4
[9] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.4
[13] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.2
[17] tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] Rdpack_2.6.4 polynom_1.4-1
[3] epitools_0.5-10.1 permute_0.9-7
[5] rlang_1.1.6 rBeta2009_1.0.1
[7] ade4_1.7-23 matrixStats_1.5.0
[9] compiler_4.4.2 mgcv_1.9-1
[11] vctrs_0.6.5 reshape2_1.4.4
[13] pkgconfig_2.0.3 crayon_1.5.3
[15] fastmap_1.2.0 backports_1.5.0
[17] XVector_0.46.0 labeling_0.4.3
[19] rmarkdown_2.29 tzdb_0.5.0
[21] UCSC.utils_1.2.0 xfun_0.52
[23] MultiAssayExperiment_1.32.0 zlibbioc_1.52.0
[25] GenomeInfoDb_1.42.3 jsonlite_2.0.0
[27] biomformat_1.34.0 gmp_0.7-5
[29] rhdf5filters_1.18.1 DelayedArray_0.32.0
[31] Rhdf5lib_1.28.0 broom_1.0.8
[33] parallel_4.4.2 cluster_2.1.8.1
[35] R6_2.6.1 stringi_1.8.7
[37] RColorBrewer_1.1-3 GenomicRanges_1.58.0
[39] Rcpp_1.0.14 SummarizedExperiment_1.36.0
[41] iterators_1.0.14 knitr_1.50
[43] IRanges_2.40.1 Matrix_1.7-3
[45] splines_4.4.2 igraph_2.1.4
[47] timechange_0.3.0 tidyselect_1.2.1
[49] rstudioapi_0.17.1 abind_1.4-8
[51] yaml_2.3.10 vegan_2.6-10
[53] codetools_0.2-20 partitions_1.10-9
[55] lattice_0.22-6 plyr_1.8.9
[57] Biobase_2.66.0 withr_3.0.2
[59] marginaleffects_0.27.0 evaluate_1.0.3
[61] survival_3.8-3 xml2_1.3.8
[63] Biostrings_2.74.1 pillar_1.10.2
[65] MatrixGenerics_1.18.1 checkmate_2.3.2
[67] foreach_1.5.2 stats4_4.4.2
[69] insight_1.3.0 generics_0.1.4
[71] S4Vectors_0.44.0 hms_1.1.3
[73] scales_1.4.0 glue_1.8.0
[75] tools_4.4.2 data.table_1.17.4
[77] binGroup2_1.3.1 fs_1.6.6
[79] rhdf5_2.50.2 ape_5.8-1
[81] rbibutils_2.3 colorspace_2.1-1
[83] nlme_3.1-168 GenomeInfoDbData_1.2.13
[85] cli_3.6.5 S4Arrays_1.6.0
[87] gtable_0.3.6 ggh4x_0.3.1
[89] sass_0.4.10 digest_0.6.37
[91] BiocGenerics_0.52.0 SparseArray_1.6.2
[93] htmlwidgets_1.6.4 farver_2.1.2
[95] htmltools_0.5.8.1 multtest_2.62.0
[97] lifecycle_1.0.4 httr_1.4.7
[99] MASS_7.3-65